1.Load Data¶

In [1]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from IPython.display import display
from plotly.subplots import make_subplots
import plotly.express as px
import plotly.graph_objects as go
In [2]:
%%time
df = dd.read_parquet(r"C:\Users\prath\OneDrive\Desktop\microsoft_malware_prediction\Data Exploration\train.parquet")
df = df.compute()
CPU times: total: 22.7 s
Wall time: 1min 38s
In [3]:
print(df.shape)
(8921483, 83)
In [4]:
display(df.head())
MachineIdentifier ProductName EngineVersion AppVersion AvSigVersion IsBeta RtpStateBitfield IsSxsPassiveMode DefaultBrowsersIdentifier AVProductStatesIdentifier ... Census_FirmwareVersionIdentifier Census_IsSecureBootEnabled Census_IsWIMBootEnabled Census_IsVirtualDevice Census_IsTouchEnabled Census_IsPenCapable Census_IsAlwaysOnAlwaysConnectedCapable Wdft_IsGamer Wdft_RegionIdentifier HasDetections
0 0000028988387b115f69f31a3bf04f09 win8defender 1.1.15100.1 4.18.1807.18075 1.273.1735.0 0 7.0 0 NaN 53447.0 ... 36144.0 0 NaN 0.0 0 0 0.0 0.0 10.0 0
1 000007535c3f730efa9ea0b7ef1bd645 win8defender 1.1.14600.4 4.13.17134.1 1.263.48.0 0 7.0 0 NaN 53447.0 ... 57858.0 0 NaN 0.0 0 0 0.0 0.0 8.0 0
2 000007905a28d863f6d0d597892cd692 win8defender 1.1.15100.1 4.18.1807.18075 1.273.1341.0 0 7.0 0 NaN 53447.0 ... 52682.0 0 NaN 0.0 0 0 0.0 0.0 3.0 0
3 00000b11598a75ea8ba1beea8459149f win8defender 1.1.15100.1 4.18.1807.18075 1.273.1527.0 0 7.0 0 NaN 53447.0 ... 20050.0 0 NaN 0.0 0 0 0.0 0.0 3.0 1
4 000014a5f00daa18e76b81417eeb99fc win8defender 1.1.15100.1 4.18.1807.18075 1.273.1379.0 0 7.0 0 NaN 53447.0 ... 19844.0 0 0.0 0.0 0 0 0.0 0.0 1.0 1

5 rows × 83 columns

1.2 Check Target:¶

In [5]:
fig = px.bar(df["HasDetections"].value_counts().reset_index(), 
             x="index", y="HasDetections", color="index", title="Has Detection(target)")
fig.show()
In [6]:
df["HasDetections"].value_counts()
Out[6]:
0    4462591
1    4458892
Name: HasDetections, dtype: int64

we have balanced target¶

2. Checking Dataset¶

  • Checking for null value
In [7]:
%%time
stats = []
for col in df.columns:
    stats.append((col, df[col].nunique(), df[col].isnull().sum() * 100 / df.shape[0], df[col].value_counts(normalize=True, dropna=False).values[0] * 100, df[col].dtype))

stats_df = pd.DataFrame(stats, columns=['Feature', 'Unique_values', 'Percentage of missing values', 'Percentage of values in the biggest category', 'type'])
stats_df.sort_values('Percentage of missing values', ascending=False)
CPU times: total: 14 s
Wall time: 16.4 s
Out[7]:
Feature Unique_values Percentage of missing values Percentage of values in the biggest category type
28 PuaMode 2 99.974119 99.974119 category
41 Census_ProcessorClass 3 99.589407 99.589407 category
8 DefaultBrowsersIdentifier 2017 95.141637 95.141637 float32
68 Census_IsFlightingInternal 2 83.044030 83.044030 float32
52 Census_InternalBatteryType 78 71.046809 71.046809 category
... ... ... ... ... ...
1 ProductName 6 0.000000 98.935569 category
45 Census_HasOpticalDiskDrive 2 0.000000 92.281272 int8
54 Census_OSVersion 469 0.000000 15.845202 category
55 Census_OSArchitecture 3 0.000000 90.858045 category
82 HasDetections 2 0.000000 50.020731 int8

83 rows × 5 columns

In [8]:
stats_df = stats_df.astype({'type': 'string'})

table_data = stats_df.head(50)

fig = go.Figure(data=[go.Table(
    header=dict(values=list(table_data.columns),
                fill_color='paleturquoise',
                align='left',
                font=dict(color='black', size=12)),
    cells=dict(values=[table_data[column] for column in table_data.columns],
               fill_color='lavender',
               align='left',
               font=dict(color='black', size=11)))
])

fig.show()

Observation from Table:¶

  • PuaMode and Census_ProcessorClass have 99+ missing value which are not needed for further EDA and ML purposes.
  • DefaultBrowsersIdentifier column has 95% missing value and this will be removed too.
In [9]:
train= pd.DataFrame(df)
good_cols = list(df.columns)
for col in df.columns:
    rate = df[col].value_counts(normalize=True, dropna=False).values[0]
    if rate > 0.9: # Removing Missing Values if 
        good_cols.remove(col)
In [10]:
train= df[good_cols]
In [11]:
train.head()
Out[11]:
MachineIdentifier EngineVersion AppVersion AvSigVersion AVProductStatesIdentifier AVProductsInstalled CountryIdentifier CityIdentifier OrganizationIdentifier GeoNameIdentifier ... Census_IsFlightingInternal Census_ThresholdOptIn Census_FirmwareManufacturerIdentifier Census_FirmwareVersionIdentifier Census_IsSecureBootEnabled Census_IsWIMBootEnabled Census_IsTouchEnabled Wdft_IsGamer Wdft_RegionIdentifier HasDetections
0 0000028988387b115f69f31a3bf04f09 1.1.15100.1 4.18.1807.18075 1.273.1735.0 53447.0 1.0 29 128035.0 18.0 35.0 ... NaN NaN 628.0 36144.0 0 NaN 0 0.0 10.0 0
1 000007535c3f730efa9ea0b7ef1bd645 1.1.14600.4 4.13.17134.1 1.263.48.0 53447.0 1.0 93 1482.0 18.0 119.0 ... NaN NaN 628.0 57858.0 0 NaN 0 0.0 8.0 0
2 000007905a28d863f6d0d597892cd692 1.1.15100.1 4.18.1807.18075 1.273.1341.0 53447.0 1.0 86 153579.0 18.0 64.0 ... NaN NaN 142.0 52682.0 0 NaN 0 0.0 3.0 0
3 00000b11598a75ea8ba1beea8459149f 1.1.15100.1 4.18.1807.18075 1.273.1527.0 53447.0 1.0 88 20710.0 NaN 117.0 ... NaN NaN 355.0 20050.0 0 NaN 0 0.0 3.0 1
4 000014a5f00daa18e76b81417eeb99fc 1.1.15100.1 4.18.1807.18075 1.273.1379.0 53447.0 1.0 18 37376.0 NaN 277.0 ... 0.0 0.0 355.0 19844.0 0 0.0 0 0.0 1.0 1

5 rows × 57 columns

Plot function data¶

In [12]:
def plot_categorical_feature(col, only_bars=False, top_n=10, by_touch=False):
    top_n = top_n if train[col].nunique() > top_n else train[col].nunique()
    print(f"{col} has {train[col].nunique()} unique values and type: {train[col].dtype}.")
    print(train[col].value_counts(normalize=True, dropna=False).head())
    if not by_touch:
        if not only_bars:
            df = train.groupby([col]).agg({'HasDetections': ['count', 'mean']})
            df = df.sort_values(('HasDetections', 'count'), ascending=False).head(top_n).sort_index()
            fig = make_subplots(specs=[[{"secondary_y": True}]])
            fig.add_trace(go.Bar(x=df.index, y=df['HasDetections']['count'].values, name='counts'), secondary_y=False)
            fig.add_trace(go.Scatter(x=df.index, y=df['HasDetections']['mean'], name='Detections rate'), secondary_y=True)
            fig.update_layout(title=f"Counts of {col} by top-{top_n} categories and mean target value",
                              xaxis=dict(title=f'{col}',
                                         showgrid=False,
                                         zeroline=False,
                                         showline=False,),
                              yaxis=dict(title='Counts',
                                         showgrid=False,
                                         zeroline=False,
                                         showline=False,),
                              yaxis2=dict(title='Detections rate', overlaying='y', side='right'),
                              legend=dict(orientation="v"))
        else:
            top_cat = list(train[col].value_counts(dropna=False).index[:top_n])
            df0 = train.loc[(train[col].isin(top_cat)) & (train['HasDetections'] == 1), col].value_counts().head(10).sort_index()
            df1 = train.loc[(train[col].isin(top_cat)) & (train['HasDetections'] == 0), col].value_counts().head(10).sort_index()
            fig = go.Figure()
            fig.add_trace(go.Bar(x=df0.index, y=df0.values, name='Has Detections'))
            fig.add_trace(go.Bar(x=df1.index, y=df1.values, name='No Detections'))
            fig.update_layout(title=f"Counts of {col} by top-{top_n} categories",
                              xaxis=dict(title=f'{col}',
                                         showgrid=False,
                                         zeroline=False,
                                         showline=False,),
                              yaxis=dict(title='Counts',
                                         showgrid=False,
                                         zeroline=False,
                                         showline=False,),
                              legend=dict(orientation="v"), barmode='group')

        fig.show()
    else:
        top_n = 10
        top_cat = list(train[col].value_counts(dropna=False).index[:top_n])
        df = train.loc[train[col].isin(top_cat)]

        df1 = train.loc[train['Census_IsTouchEnabled'] == 1]
        df0 = train.loc[train['Census_IsTouchEnabled'] == 0]

        df0_ = df0.groupby([col]).agg({'HasDetections': ['count', 'mean']})
        df0_ = df0_.sort_values(('HasDetections', 'count'), ascending=False).head(top_n).sort_index()
        df1_ = df1.groupby([col]).agg({'HasDetections': ['count', 'mean']})
        df1_ = df1_.loc[df0_.index]

        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=df0_[('HasDetections', 'mean')], y=df0_.index,
            orientation='h', name='Census_IsTouchEnabled=0', marker=dict(color='royalblue')
        ))
        fig.add_trace(go.Bar(
            x=df1_[('HasDetections', 'mean')], y=df1_.index,
            orientation='h', name='Census_IsTouchEnabled=1', marker=dict(color='seagreen')
        ))
        fig.update_layout(
            title=f'Distribution of HasDetections by {col}',
            xaxis_title='% of HasDetections',
            yaxis_title=col,
            legend=dict(x=0.85, y=0.95),
            margin=dict(l=120, r=20, t=50, b=50),
            height=600
        )
        fig.show()
In [13]:
plot_categorical_feature('Census_IsTouchEnabled', True)
Census_IsTouchEnabled has 2 unique values and type: int8.
0    0.874457
1    0.125543
Name: Census_IsTouchEnabled, dtype: float64
In [14]:
plot_categorical_feature('EngineVersion', by_touch=True)
EngineVersion has 70 unique values and type: category.
1.1.15200.1    0.430990
1.1.15100.1    0.412030
1.1.15000.2    0.029728
1.1.14901.4    0.023809
1.1.14600.4    0.018000
Name: EngineVersion, dtype: float64
In [15]:
plot_categorical_feature('AppVersion')
AppVersion has 110 unique values and type: category.
4.18.1807.18075    0.576050
4.18.1806.18062    0.095380
4.12.16299.15      0.040338
4.10.209.0         0.030539
4.13.17134.1       0.028837
Name: AppVersion, dtype: float64
In [16]:
plot_categorical_feature('AvSigVersion')
AvSigVersion has 8531 unique values and type: category.
1.273.1420.0    0.011469
1.263.48.0      0.010987
1.275.1140.0    0.010899
1.275.727.0     0.010362
1.273.371.0     0.009748
Name: AvSigVersion, dtype: float64
In [17]:
plot_categorical_feature('AVProductStatesIdentifier',True,10)
AVProductStatesIdentifier has 28970 unique values and type: float32.
53447.0    0.652870
7945.0     0.053343
47238.0    0.036727
62773.0    0.029901
46413.0    0.012652
Name: AVProductStatesIdentifier, dtype: float64
In [18]:
plot_categorical_feature('AVProductsInstalled',True)
AVProductsInstalled has 8 unique values and type: float32.
1.0    0.695949
2.0    0.275628
3.0    0.023326
NaN    0.004060
4.0    0.000982
Name: AVProductsInstalled, dtype: float64
In [20]:
plot_categorical_feature('CountryIdentifier',True,20)
CountryIdentifier has 222 unique values and type: int16.
43     0.044519
29     0.039006
141    0.037372
93     0.031791
171    0.031449
Name: CountryIdentifier, dtype: float64
In [21]:
plot_categorical_feature('CityIdentifier',True,20)
CityIdentifier has 107366 unique values and type: float32.
NaN         0.036475
130775.0    0.010627
16668.0     0.009503
82373.0     0.009338
10222.0     0.008050
Name: CityIdentifier, dtype: float64
In [22]:
plot_categorical_feature('OrganizationIdentifier', True, by_touch=True)
OrganizationIdentifier has 49 unique values and type: float32.
27.0    0.470377
NaN     0.308415
18.0    0.197745
48.0    0.007156
50.0    0.005100
Name: OrganizationIdentifier, dtype: float64
In [23]:
plot_categorical_feature('GeoNameIdentifier',True)
GeoNameIdentifier has 292 unique values and type: float32.
277.0    0.171712
211.0    0.047432
53.0     0.045823
89.0     0.040441
240.0    0.038846
Name: GeoNameIdentifier, dtype: float64
In [24]:
plot_categorical_feature('LocaleEnglishNameIdentifier', True)
LocaleEnglishNameIdentifier has 276 unique values and type: int16.
75     0.234780
182    0.050450
74     0.046075
42     0.045913
88     0.042058
Name: LocaleEnglishNameIdentifier, dtype: float64
In [25]:
plot_categorical_feature('OsPlatformSubRelease', True, by_touch=True)
OsPlatformSubRelease has 9 unique values and type: category.
rs4    0.438887
rs3    0.280635
rs2    0.087460
rs1    0.081917
th2    0.046136
Name: OsPlatformSubRelease, dtype: float64
In [26]:
plot_categorical_feature('OsBuildLab', True)
OsBuildLab has 663 unique values and type: category.
17134.1.amd64fre.rs4_release.180410-1804                 0.410044
16299.431.amd64fre.rs3_release_svc_escrow.180502-1908    0.140411
16299.15.amd64fre.rs3_release.170928-1534                0.107724
15063.0.amd64fre.rs2_release.170317-1834                 0.080484
17134.1.x86fre.rs4_release.180410-1804                   0.028815
Name: OsBuildLab, dtype: float64
In [27]:
plot_categorical_feature('IeVerIdentifier', True)
IeVerIdentifier has 303 unique values and type: float32.
137.0    0.435560
117.0    0.198166
108.0    0.053174
111.0    0.052438
98.0     0.039726
Name: IeVerIdentifier, dtype: float64
In [28]:
plot_categorical_feature('Census_OEMNameIdentifier', True)
Census_OEMNameIdentifier has 3832 unique values and type: float32.
2668.0    0.144289
2102.0    0.116412
1443.0    0.106432
2206.0    0.103609
585.0     0.100370
Name: Census_OEMNameIdentifier, dtype: float64
In [29]:
plot_categorical_feature('Census_ProcessorCoreCount',True,by_touch=True)
Census_ProcessorCoreCount has 45 unique values and type: float32.
4.0     0.608665
2.0     0.259146
8.0     0.096957
12.0    0.010391
1.0     0.007890
Name: Census_ProcessorCoreCount, dtype: float64
In [30]:
plot_categorical_feature('Census_ProcessorModelIdentifier', True)
Census_ProcessorModelIdentifier has 3428 unique values and type: float32.
2697.0    0.032425
1998.0    0.029972
2660.0    0.021453
2373.0    0.019661
1992.0    0.019249
Name: Census_ProcessorModelIdentifier, dtype: float64
In [31]:
plot_categorical_feature('Census_PrimaryDiskTotalCapacity', True)
Census_PrimaryDiskTotalCapacity has 5735 unique values and type: float64.
476940.0    0.318504
953869.0    0.243881
305245.0    0.053199
122104.0    0.052576
244198.0    0.050696
Name: Census_PrimaryDiskTotalCapacity, dtype: float64
In [32]:
plot_categorical_feature('Census_TotalPhysicalRAM', True,by_touch=True)
Census_TotalPhysicalRAM has 3446 unique values and type: float32.
4096.0     0.458950
8192.0     0.246204
2048.0     0.123015
16384.0    0.059582
6144.0     0.044687
Name: Census_TotalPhysicalRAM, dtype: float64